/***************************************************************************************
Firm-embedded productivity and cross-country income differences
Alviarez, Cravino and Ramondo
Journal of Political Economy (2022)

Program: clean_01_orbis.do
Date: October 2022

Description: Clean the raw Orbis data for firm level analysis 

*****************************************************************************************/
clear all
set more off
global typeden=0 
do "set_directories.do"



*=======================================================================
*Ceaning ORBIS firm level data containing MNCs 
*=======================================================================
use "${intermediate}/orbis-mult_short_final_all_links_20200109.dta", clear
gen isocode=substr(bvd_id,1,2)
gen hq=substr(guo_50,1,2)
	  

*Eliminate observations without revenue or employment information
*----------------------------------------------------------------
gen data_noavailable=((turn==. | turn==0) & (emp==. | emp==0))
tab data_noavailable
drop if data_noavailable==1
drop data_noavailable


*Eliminate observations with negative revenues 
*----------------------------------------------------------------
gen data_negative=(turn<0)
tab data_negative
drop if data_negative==1
drop data_negative


*Deal with repeated observations 
*----------------------------------------------------------------
drop duplicates 
duplicates tag bvd_id year, gen(dupli)
tab dupli

gen data_rev=(turn!=. & turn!=0)
gen data_emp=(emp!=. & emp!=0)
by bvd_id year, sort: egen data_rev_max=max(data_rev)
by bvd_id year, sort: egen data_emp_max=max(data_emp)

gen gturn=-turn
sort bvd_id year gturn
by bvd_id year, sort: gen a=_n
drop if dupli==1 & data_rev_max==1 & a>1
drop dupli a gturn
duplicates tag bvd_id year, gen(dupli)

gen gemp=-emp
sort bvd_id year gemp
by bvd_id year, sort: gen a=_n
drop if dupli==1 & data_emp_max==1 & a>1
drop dupli a gemp
duplicates tag bvd_id year, gen(dupli)
drop dupli


*Keep MNCs only 
*----------------------------------------------------------------
by year guo_50 isocode, sort: gen a=_n==1
tab a
by year guo_50, sort: egen num_iso=total(a)
drop a
gen MNC=(num_iso>1)
tab MNC
tab MNC if isocode!=hq
drop if MNC==0 & isocode==hq
tab MNC
drop if MNC==0 & isocode!=hq
drop MNC
compress
tempfile temp0
save `temp0', replace 


*Keep and rename key variables  
*----------------------------------------------------------------
use `temp0', clear
keep bvd_id con_code turn emp year guo_50 guo_50c country_code_iso entity industry_code entity_guo50 industry_code_guo50 entity_guo50c industry_code_guo50c exp_rev val_added year_incorp_date isocode hq

rename bvd_id id_bvd
rename guo_50 guo_bvd 
rename guo_50c guo_bvdc

rename entity entity_id
rename entity_guo50 entity_guo
rename entity_guo50c entity_guoc

rename industry_code NAICS
rename industry_code_guo50 NAICS_guo
rename industry_code_guo50c NAICS_guoc

rename turn sales
rename exp_rev exports 
rename val_added va
rename year_incorp_date year_first
gen age=year-year_first
replace age=. if age<0
drop year_first
gen hqc=substr(guo_bvdc,1,2)

label var guo_bvdc "GUO (min 50%) only with type B, C, A and F"
label var guo_bvd "GUO (min 50%)"


*Entity Type: 
*--------------------------------------------------------
*B: Bank 
*F: Finacial Company 
*A: Insurance Company 
*C: Industrial Company
*E: Mutual & Pension Fund/Nominee/Trust/Trustee
*J: Foundation/Research Institute
*S: Public authorities, States, Governments
*I: One or more known individuals or families
*M: Employees/Managers/Directors
*H: Self ownership
*P: Private equity firms
*Z: Public
*D: Unnamed private shareholders, aggregated
*L: Other unnamed shareholders, aggregated
*V: Venture capital
*Y: Hedge fund
*Q: Branch 
*W: Mrine Vessels 

replace hq=hqc if (hq=="WW" | hq=="YY" | hq=="ZZ" | hq=="XX" | hq=="" )

*Use guo (as above), but use the mode hqc (to scape tax haven situations) 
by guo_bvd, sort: egen xxx_mode=mode(hqc), maxmode
gen hq2=xxx_mode
replace hq2=hq if hq2==""
drop hq
rename hq2 hq
drop xxx_mode

*Use guoc and hqc (and guo/hq if not available) 
replace guo_bvdc=guo_bvd if guo_bvdc==""
replace hqc=hq if hqc==""

global cond1 `"  isocode=="AR" | isocode=="AT" | isocode=="AU" | isocode=="BE" | isocode=="BG" | isocode=="BR" | isocode=="CA" |  isocode=="CH" | isocode=="CL" | isocode=="CN" | isocode=="CY" | isocode=="CO" | isocode=="CZ" |  isocode=="DE" |  isocode=="DK" |  isocode=="EE" |  isocode=="ES" |  isocode=="FI" |  isocode=="FR" |  isocode=="GB" |  isocode=="GR" |  isocode=="HK" |  isocode=="HU" |  isocode=="HR" |  isocode=="ID" |  isocode=="IL" |  isocode=="IE" | isocode=="IN" |  isocode=="IS" |  isocode=="IT" |  isocode=="JP" |  isocode=="KR" | isocode=="LT" | isocode=="LV" | isocode=="LU" |  isocode=="MX" |  isocode=="MT" |  isocode=="MY" |  isocode=="NL" |   isocode=="NO" |  isocode=="NZ" |  isocode=="PE" |  isocode=="PL" |  isocode=="PT" |  isocode=="RO" |  isocode=="RS" |   isocode=="RU" | isocode=="SE" |  isocode=="SG" | isocode=="SI" | isocode=="SK" | isocode=="TH" | isocode=="TR" | isocode=="TW" | isocode=="UA" |  isocode=="US" | isocode=="VE" |  isocode=="ZA"   "'
gen a=($cond1)
tab a
keep if a==1
drop a
compress
order year isocode id_bvd NAICS guo_bvd hq guo_bvdc hqc isocode sales emp va exports age


*Sector classification* 
*--------------------------------------------------------------------
tostring NAICS*, replace
replace NAICS_guo="" if NAICS_guo=="."
replace NAICS_guoc="" if NAICS_guoc=="."
replace NAICS="" if NAICS=="."

rename NAICS NAICS4
gen NAICS2=substr(NAICS4,1,2)
gen NAICS3=substr(NAICS4,1,3)

gen sector="NA"
replace sector="NA" if NAICS2=="" | NAICS2=="NA"
replace sector="Agriculture (A)" if NAICS2=="11"
replace sector="Mining (B)" if NAICS2=="21"

replace sector="Food (10-12)" if NAICS3=="311" | NAICS3=="312"
replace sector="Textiles (13-15)" if NAICS3=="313" | NAICS3=="314" | NAICS3=="315" | NAICS3=="316"
replace sector="Wood (16-18)" if NAICS3=="321" | NAICS3=="322" | NAICS3=="323"
replace sector="Petroleum (19)" if NAICS3=="324"
replace sector="Chemicals (20-21)" if NAICS3=="325"
replace sector="Plastic (22-23)" if NAICS3=="326" | NAICS3=="327"
replace sector="Basic Metals (24-25)" if NAICS3=="331" | NAICS3=="332"
replace sector="Electrical_Equip (26-27)" if NAICS3=="334" |  NAICS3=="335" 
replace sector="Machinery (28)" if NAICS3=="333" 
replace sector="Transport (29-30)" if NAICS3=="336"
replace sector="Other_Manufacturing (31-33)" if NAICS3=="337" | NAICS3=="339"

replace sector="Electricity (D-E)" if NAICS2=="22"
replace sector="Construction (F)" if NAICS2=="23"
replace sector="Wholesale_Retail (G)" if NAICS2=="41" | NAICS2=="42" | NAICS2=="44" | NAICS2=="45"
replace sector="Transportation_Storage (H)" if NAICS2=="48" | NAICS2=="49"
replace sector="Accomodation (I)" if NAICS2=="72"
replace sector="Information (J)" if  NAICS2=="51"
replace sector="Financial_Insurance (K)" if  NAICS2=="52"
replace sector="Real_Estate (L)" if  NAICS2=="53"
replace sector="Support_Services (M-N)" if NAICS2=="54" | NAICS2=="55" | NAICS2=="56"
replace sector="Public_Administration (O)" if NAICS2=="92"
replace sector="Education (P)" if  NAICS2=="61"
replace sector="Health (Q)" if  NAICS2=="62"
replace sector="Recreation (R-S)" if NAICS2=="71" | NAICS2=="81"

tab sector 
compress
tempfile temp1
save `temp1', replace


*Manufacturing* 
*------------------------------
clear all
use `temp1', clear
gen sector1="NA"
replace sector1="Manufacturing (C)" if sector=="Food (10-12)"
replace sector1="Manufacturing (C)" if sector=="Textiles (13-15)"
replace sector1="Manufacturing (C)" if sector=="Wood (16-18)"
replace sector1="Manufacturing (C)" if sector=="Petroleum (19)"
replace sector1="Manufacturing (C)" if sector=="Chemicals (20-21)"
replace sector1="Manufacturing (C)" if sector=="Plastic (22-23)"
replace sector1="Manufacturing (C)" if sector=="Basic Metals (24-25)"
replace sector1="Manufacturing (C)" if sector=="Electrical_Equip (26-27)"
replace sector1="Manufacturing (C)" if sector=="Machinery (28)"
replace sector1="Manufacturing (C)" if sector=="Transport (29-30)"
replace sector1="Manufacturing (C)" if sector=="Other_Manufacturing (31-33)"


*Marketing Services
*------------------------------
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Wholesale_Retail (G)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Transportation_Storage (H)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Accomodation (I)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Information (J)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Financial_Insurance (K)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Support_Services (M-N)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Recreation (R-S)"
replace sector1="Market_Services (G-H-I-J-K-M-N-R-S-T)" if sector=="Activity_households (T)"

*Other Goods
*------------------------------
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Agriculture (A)"
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Mining (B)"
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Electricity (D-E)"
replace sector1="Other_Goods (A-B-D-E-F)" if sector=="Construction (F)"


*Non Market Economy 
*------------------------------
replace sector1="Non-Market Economy" if sector!="NA" & sector1=="NA"


*Aggregating some sectors 
*------------------------------
replace sector="TexWood (13-18)" if sector=="Textiles (13-15)" | sector=="Wood (16-18)"
replace sector="ChePetPla (19-23)" if sector=="Chemicals (20-21)" | sector=="Petroleum (19)" | sector=="Plastic (22-23)"
replace sector="ElecMach (26-28)" if sector=="Electrical_Equip (26-27)" | sector=="Machinery (28)"
replace sector="TranspOtherManuf (29-33)" if sector=="Transport (29-30)" | sector=="Other_Manufacturing (31-33)"
replace sector="AgrMining (A-B)" if sector=="Agriculture (A)" | sector=="Mining (B)"
replace sector="Accomod_Recreat (I-R-S)" if sector=="Accomodation (I)" | sector=="Recreation (R-S)"
drop if sector=="NA" & sector1=="NA"

drop if isocode=="CY" | isocode=="ID" | isocode=="IL"| isocode=="MT" | isocode=="MY" | isocode=="VE" | isocode=="ZA"   

foreach vvv in sales va exports {
display "`vvv'"
*From USD to MM USD 
replace `vvv'=`vvv'/1000000
format %9.1fc `vvv' 
}
*From number to thousands of employees 
replace emp=emp/1000
format %9.1fc emp 

label var sales "Operating Revenue (MM USD)"
label var exports "Exports (MM USD)"
label var va "Value Added (MM USD)"
label var emp "Number of employees (th)"

drop if (sales==. | sales==0) &  (emp==. | emp==0) & (va==. | va==0) &  (exports==. | exports==0)
keep year isocode id_bvd guo_bvd hq NAICS2 NAICS3 NAICS4 sector sector1 sales emp va exports con_code
order year isocode id_bvd guo_bvd hq NAICS2 NAICS3 NAICS4 sector sector1 sales emp va exports con_code
sort year isocode id_bvd sector

compress
save "${data}/firm_allyears_sales.dta", replace


